
/***************************************************************************
                                                                           *
Copyright 2013 CertiVox UK Ltd.                                           *
                                                                           *
This file is part of CertiVox MIRACL Crypto SDK.                           *
                                                                           *
The CertiVox MIRACL Crypto SDK provides developers with an                 *
extensive and efficient set of cryptographic functions.                    *
For further information about its features and functionalities please      *
refer to http://www.certivox.com                                           *
                                                                           *
* The CertiVox MIRACL Crypto SDK is free software: you can                 *
  redistribute it and/or modify it under the terms of the                  *
  GNU Affero General Public License as published by the                    *
  Free Software Foundation, either version 3 of the License,               *
  or (at your option) any later version.                                   *
                                                                           *
* The CertiVox MIRACL Crypto SDK is distributed in the hope                *
  that it will be useful, but WITHOUT ANY WARRANTY; without even the       *
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
  See the GNU Affero General Public License for more details.              *
                                                                           *
* You should have received a copy of the GNU Affero General Public         *
  License along with CertiVox MIRACL Crypto SDK.                           *
  If not, see <http://www.gnu.org/licenses/>.                              *
                                                                           *
You can be released from the requirements of the license by purchasing     *
a commercial license. Buying such a license is mandatory as soon as you    *
develop commercial activities involving the CertiVox MIRACL Crypto SDK     *
without disclosing the source code of your own applications, or shipping   *
the CertiVox MIRACL Crypto SDK with a closed source product.               *
                                                                           *
***************************************************************************/
/*
 *   Many processors support a floating-point coprocessor, which may
 *   implement a faster multiplication instruction than the corresponding
 *   integer instruction. This is the case for the 80486/Pentium processor
 *   which has a built-in co-processor. This can be exploited to give even
 *   faster performance.
 *
 *   As before the fixed modulus size to be used is pre-defined as
 *   MR_PENTIUM in mirdef.h
 *
 *   Note that since the partial products are accumulated in a 64-bit register
 *   this implies that a full-width number base (2^32) cannot be used.
 *   The maximum number base that can be used is 2^x where x is
 *   calculated such that 2^(64-2*x) > 2*MR_PENTIUM. This means that
 *   x will usually be 28 or 29
 *
 *   To use this code:-
 *
 *   (1) Define MR_PENTIUM in mirdef.h to the fixed size of the modulus
 *
 *   (2) Use as a number base the value of x calculated as shown above
 *       For example, for 512 bit exponentiation, #define MR_PENTIUM 18
 *       in mirdef.h and call mirsys(50,536870912L) in your main program.
 *       (Observe that 536870912 = 2^29, and that 18*29 = 522, big enough
 *       for 512 bit calculations).
 *
 *   (3) Use Montgomery representation when implementing your crypto-system
 *       i.e. use monty_powmod(). This will automatically call the
 *       routines in this module.
 *
 *   Note that this module generates a *lot* of code e.g. > 49kbytes for
 *   MR_PENTIUM = 36. Compile using -B switch - you will need
 *   the TASM macro-assembler. If out-of-memory, try using the TASMX /ml
 *   version of the assembler.
 *
 *   Note that it is *VITAL* that double arrays be aligned on 8-byte
 *   boundaries for maximum speed on a Pentium.
 *
 *   Many thanks are due to Paul Rubin, who suggested to me that this approach
 *   might be faster than the all-integer method described elsewhere.
 *
 *   The FP stack is primed in prepare_monty() :-
 *   magic  - (2^63+2^62)*base. By adding and then subtracting this number we
 *            get the top half of the sum.
 *   1/base - Inverse of the number base
 *   ndash  - Montgomery's constant
 */

#include "miracl.h"

#ifdef MR_PENTIUM

#if INLINE_ASM == 1
#define N       8
#define POINTER QWORD PTR
#define PBX     bx
#define PSI     si
#define PDI     di
#define PCX     cx
#endif

#if INLINE_ASM == 2
#define N       8
#define POINTER QWORD PTR
#define PBX     bx
#define PSI     si
#define PDI     di
#define PCX     cx
#endif

#if INLINE_ASM == 3
#define N       8
#define POINTER QWORD PTR
#define PBX     ebx
#define PSI     esi
#define PDI     edi
#define PCX     ecx
#endif

#ifdef INLINE_ASM
#ifndef MR_LMM
/* not implemented for large memory model 16 bit */

void fastmodmult(_MIPD_ big x, big y, big z)
{
    int ij;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    big       w0      = mr_mip->w0;
    big       modulus = mr_mip->modulus;
    mr_small *wg, *mg, *xg, *yg;
    wg = w0->w;
    mg = modulus->w;
    xg = x->w;
    yg = y->w;

    for (ij = 2 * MR_PENTIUM; ij < (int)(w0->len & MR_OBITS); ij++)
        w0->w[ij] = 0.0;
    w0->len = 2 * MR_PENTIUM;

    ASM{FSTEP MACRO i,
        j
            /* some fancy Pentium scheduling going on here ... */
            fld POINTER[PBX + N * i] fmul POINTER[PSI + N * j] fxch st(2)
                fadd ENDM

                    FRSTEP MACRO i,
        j fld POINTER[PDI + N * i] fmul POINTER[PSI + N * j] fxch st(2)
            fadd ENDM

                FDSTEP MACRO i,
        j fld POINTER[PBX + N * i] fmul POINTER[PBX + N * j] fxch st(2)
            fadd ENDM

                SELF MACRO k fld POINTER[PBX + N * k] fmul st,
        st(0) fadd ENDM

            RFINU MACRO k fld st(0)

                fadd st,
        st(2) fsub   st,
        st(2)

            fsubr  st,
        st(1) fmul st,
        st(4) fld  st(0)

            fadd   st,
        st(3) fsub st,
        st(3)

            fsub fst POINTER[PDI + N * k] fmul POINTER[PSI] fadd fmul st,
        st(2) ENDM

            RFIND MACRO k fld st(0)

                fadd st,
        st(2) fsub   st,
        st(2)

            fsub   st(1),
        st fmul    st,
        st(3) fxch st(1) fstp POINTER[PDI + N * k]

        ENDM

            DIAG MACRO ns,
        ne CNT1 = ns CNT2 =
            ne fld POINTER[PBX + N * CNT1] fmul POINTER[PSI + N * CNT2] CNT1 =
                CNT1 + 1 CNT2 = CNT2 - 1 WHILE CNT1 LE ne FSTEP CNT1,
        CNT2 CNT1 = CNT1 + 1 CNT2 = CNT2 - 1 ENDM fadd ENDM

                                               SDIAG MACRO ns,
        ne CNT1 = ns CNT2 = ne IF CNT1 LT CNT2 fstp st(5) /* store carry */
        fldz fld POINTER[PBX + N * CNT1] fmul POINTER[PBX + N * CNT2] CNT1 =
            CNT1 + 1 CNT2 = CNT2 - 1 WHILE CNT1 LT CNT2 FDSTEP CNT1,
        CNT2                                   CNT1 = CNT1 + 1 CNT2 =
                        CNT2 - 1 ENDM fadd fld st(0) /* now double it ... */
                        fadd fadd              st,
        st(5) /* add in carry */
        ENDIF ENDM

            RDIAGU MACRO ns,
        ne CNT1 = ns CNT2 = ne IF CNT1 LT ne fld
            POINTER[PDI + N * CNT1] fmul POINTER[PSI + N * CNT2] CNT1 =
                CNT1 + 1 CNT2 = CNT2 - 1 WHILE CNT1 LT ne FRSTEP CNT1,
        CNT2 CNT1 = CNT1 + 1 CNT2 = CNT2 - 1 ENDM fadd ENDIF ENDM

                                               RDIAGD MACRO ns,
        ne CNT1 = ns CNT2 =
            ne fld POINTER[PDI + N * CNT1] fmul POINTER[PSI + N * CNT2] CNT1 =
                CNT1 + 1 CNT2 = CNT2 - 1 WHILE CNT1 LE ne FRSTEP CNT1,
        CNT2 CNT1 = CNT1 + 1 CNT2 = CNT2 - 1 ENDM fadd ENDM

                                               MODMULT MACRO      CNT =
                                        0 WHILE CNT LT MR_PENTIUM DIAG 0,
        CNT xchg                                                  PSI,
        PCX                                                       RDIAGU 0,
        CNT RFINU CNT xchg                                        PSI,
        PCX CNT = CNT + 1 ENDM SCNT = 0 WHILE SCNT LT(MR_PENTIUM - 1) SCNT =
                      SCNT + 1 DIAG                SCNT,
        (MR_PENTIUM - 1) xchg                      PSI,
        PCX RDIAGD                                 SCNT,
        (MR_PENTIUM - 1) RFIND CNT xchg            PSI,
        PCX CNT = CNT + 1 ENDM RFIND CNT CNT =
                      CNT + 1 fstp       POINTER[PDI + N * CNT] ENDM

                                MODSQUARE MACRO     CNT =
                          0 WHILE CNT LT MR_PENTIUM SDIAG 0,
        CNT IF(CNT MOD 2) EQ 0 SELF(CNT / 2) ENDIF RDIAGU 0,
        CNT RFINU CNT CNT = CNT + 1 ENDM SCNT = 0 WHILE SCNT LT(MR_PENTIUM - 1)
                                SCNT          = SCNT + 1 SDIAG        SCNT,
        (MR_PENTIUM - 1) IF(CNT MOD 2) EQ 0 SELF(CNT / 2) ENDIF RDIAGD SCNT,
        (MR_PENTIUM - 1) RFIND CNT                                     CNT =
            CNT + 1 ENDM RFIND CNT                                     CNT =
                CNT + 1 fstp POINTER[PDI + N * CNT] ENDM} ASM
    {
        push PDI push PSI

            mov    PBX,
            xg mov PSI, yg mov PCX, mg mov PDI,
            wg

                fldz

                    MODMULT

                        pop PSI pop PDI
    }

    for (ij = MR_PENTIUM; ij < (int)(z->len & MR_OBITS); ij++) z->w[ij] = 0.0;
    z->len = MR_PENTIUM;
    for (ij = 0; ij < MR_PENTIUM; ij++) z->w[ij] = w0->w[ij + MR_PENTIUM];
    if (z->w[MR_PENTIUM - 1] == 0.0) mr_lzero(z);
}

void fastmodsquare(_MIPD_ x, z) big x, z;
{
    int ij;
#ifdef MR_OS_THREADS
    miracl* mr_mip = get_mip();
#endif
    big       w0      = mr_mip->w0;
    big       modulus = mr_mip->modulus;
    mr_small *wg, *mg, *xg;
    wg = w0->w;
    mg = modulus->w;
    xg = x->w;

    for (ij = 2 * MR_PENTIUM; ij < (int)(w0->len & MR_OBITS); ij++)
        w0->w[ij] = 0.0;
    w0->len = 2 * MR_PENTIUM;

    ASM
    {
        push PDI push PSI

            mov    PBX,
            xg mov PSI, mg mov PDI,
            wg

                fldz

                    MODSQUARE

                        pop PSI pop PDI
    }
    for (ij = MR_PENTIUM; ij < (int)(z->len & MR_OBITS); ij++) z->w[ij] = 0.0;
    z->len = MR_PENTIUM;

    for (ij = 0; ij < MR_PENTIUM; ij++) z->w[ij] = w0->w[ij + MR_PENTIUM];
    if (z->w[MR_PENTIUM - 1] == 0.0) mr_lzero(z);
}

#endif
#endif
#endif
